library(ggplot2)
library(ggthemes)
library(dplyr)
library(viridis)
library(tidyr)
library(cluster)
library(ggmap)
library(maps)

Load the Dataset and Check for duplicated records

toronto <- read.csv('data/Major_Crime_Indicators_Open_Data.csv')
head(toronto)
##           X        Y OBJECTID EVENT_UNIQUE_ID            REPORT_DATE
## 1 -79.42590 43.75735        1  GO-20141262074 2014/01/01 05:00:00+00
## 2 -79.35023 43.64629        2  GO-20141260701 2014/01/01 05:00:00+00
## 3 -79.37650 43.66642        3  GO-20141260889 2014/01/01 05:00:00+00
## 4 -85.48874  0.00000        4  GO-20141260973 2014/01/01 05:00:00+00
## 5 -79.34484 43.67895        5  GO-20141261050 2014/01/01 05:00:00+00
## 6 -79.39184 43.64664        6  GO-20141259344 2014/01/01 05:00:00+00
##                 OCC_DATE REPORT_YEAR REPORT_MONTH REPORT_DAY REPORT_DOY
## 1 1998/06/01 04:00:00+00        2014      January          1          1
## 2 2014/01/01 05:00:00+00        2014      January          1          1
## 3 2014/01/01 05:00:00+00        2014      January          1          1
## 4 2014/01/01 05:00:00+00        2014      January          1          1
## 5 2014/01/01 05:00:00+00        2014      January          1          1
## 6 2014/01/01 05:00:00+00        2014      January          1          1
##   REPORT_DOW REPORT_HOUR OCC_YEAR OCC_MONTH OCC_DAY OCC_DOY    OCC_DOW OCC_HOUR
## 1 Wednesday           12       NA                NA      NA                  12
## 2 Wednesday            3     2014   January       1       1 Wednesday         3
## 3 Wednesday            4     2014   January       1       1 Wednesday         4
## 4 Wednesday            4     2014   January       1       1 Wednesday         4
## 5 Wednesday            4     2014   January       1       1 Wednesday         4
## 6 Wednesday            2     2014   January       1       1 Wednesday         2
##   DIVISION                                                     LOCATION_TYPE
## 1      D32                                  Apartment (Rooming House, Condo)
## 2      D51 Commercial Dwelling Unit (Hotel, Motel, B & B, Short Term Rental)
## 3      D51                                  Apartment (Rooming House, Condo)
## 4      NSA             Streets, Roads, Highways (Bicycle Path, Private Road)
## 5      D54             Streets, Roads, Highways (Bicycle Path, Private Road)
## 6      D52                                                  Bar / Restaurant
##   PREMISES_TYPE UCR_CODE UCR_EXT                     OFFENCE    MCI_CATEGORY
## 1     Apartment     1480     110 Administering Noxious Thing         Assault
## 2    Commercial     2120     200                         B&E Break and Enter
## 3     Apartment     1430     100                     Assault         Assault
## 4       Outside     2130     210                  Theft Over      Theft Over
## 5       Outside     1430     100                     Assault         Assault
## 6    Commercial     1420     110         Assault Bodily Harm         Assault
##   HOOD_158   NEIGHBOURHOOD_158 HOOD_140                      NEIGHBOURHOOD_140
## 1       38    Lansing-Westgate       38                  Lansing-Westgate (38)
## 2       70     South Riverdale       70                   South Riverdale (70)
## 3       74 North St.James Town       74               North St.James Town (74)
## 4      NSA                 NSA      NSA                                    NSA
## 5       69         Blake-Jones       66                          Danforth (66)
## 6      164    Wellington Place       77 Waterfront Communities-The Island (77)
##   LONG_WGS84 LAT_WGS84
## 1  -79.42590  43.75735
## 2  -79.35023  43.64629
## 3  -79.37650  43.66642
## 4  -85.48874   0.00000
## 5  -79.34484  43.67895
## 6  -79.39184  43.64664
sum(is.na(toronto))
## [1] 315
sapply(toronto, function(x) sum(is.na(x)))
##                 X                 Y          OBJECTID   EVENT_UNIQUE_ID 
##                 0                 0                 0                 0 
##       REPORT_DATE          OCC_DATE       REPORT_YEAR      REPORT_MONTH 
##                 0                 0                 0                 0 
##        REPORT_DAY        REPORT_DOY        REPORT_DOW       REPORT_HOUR 
##                 0                 0                 0                 0 
##          OCC_YEAR         OCC_MONTH           OCC_DAY           OCC_DOY 
##               105                 0               105               105 
##           OCC_DOW          OCC_HOUR          DIVISION     LOCATION_TYPE 
##                 0                 0                 0                 0 
##     PREMISES_TYPE          UCR_CODE           UCR_EXT           OFFENCE 
##                 0                 0                 0                 0 
##      MCI_CATEGORY          HOOD_158 NEIGHBOURHOOD_158          HOOD_140 
##                 0                 0                 0                 0 
## NEIGHBOURHOOD_140        LONG_WGS84         LAT_WGS84 
##                 0                 0                 0
missingCols <- select(toronto, OCC_YEAR, OCC_MONTH, OCC_DAY, OCC_DOY, OCC_DOW)
toronto <- na.omit(toronto)
(colMeans(is.na(missingCols)))*100
##   OCC_YEAR  OCC_MONTH    OCC_DAY    OCC_DOY    OCC_DOW 
## 0.03247798 0.00000000 0.03247798 0.03247798 0.00000000
sum(duplicated(toronto$EVENT_UNIQUE_ID))
## [1] 41575
toronto <- subset(toronto, !duplicated(toronto$EVENT_UNIQUE_ID))
unique(toronto$OCC_YEAR)
##  [1] 2014 2013 2012 2003 2011 2004 2010 2009 2008 2006 2000 2005 2002 2001 2015
## [16] 2007 2016 2017 2018 2019 2020 2021 2022
unique(toronto$REPORT_YEAR)
## [1] 2014 2015 2016 2017 2018 2019 2020 2021 2022
year_group <- group_by(toronto, OCC_YEAR)
crime_by_year <- summarise(year_group,
                          n = n())
crime_by_year
## # A tibble: 23 × 2
##    OCC_YEAR     n
##       <int> <int>
##  1     2000    24
##  2     2001    17
##  3     2002    17
##  4     2003    12
##  5     2004    22
##  6     2005    19
##  7     2006    12
##  8     2007    25
##  9     2008    36
## 10     2009    54
## # ℹ 13 more rows
# Getting the Count of Each category of Crime
table(unlist(toronto$MCI_CATEGORY))
## 
##         Assault      Auto Theft Break and Enter         Robbery      Theft Over 
##          144916           41642           61089           23543           10426
drops <- c("X", "Y", "OBJECTID", "UCR_CODE", "UCR_EXT", "REPORT_DATE", "REPORT_MONTH", "REPORT_DAY", "REPORT_DOY", "REPORT_DOW", "REPORT_HOUR", "OCC_DOY", "REPORT_YEAR", "DIVISION", "HOOD_158", "HOOD_140")
toronto <- toronto[, !(names(toronto) %in% drops)]
What are the total number of crimes in each year.
cases_count <- toronto %>% 
  filter(OCC_YEAR >= 2014) %>%
  count(OCC_YEAR) %>%
  group_by(OCC_YEAR)

# Getting the Count of Number of Crimes each year
ggplot(cases_count, aes(x = OCC_YEAR, y = n, label=n)) +
  geom_line() +
  geom_point() +
  geom_text(hjust=0, vjust=0) +
  labs(x = "Year", 
       y = "Total Number of Criminal Cases throughout 2014 to 2022", 
       title = "Yearwise total Criminal Cases throughout 2014 to 2022")

What are the Major Crimes in each year.

# Grouping Major Crimes from 2014 to 2022
indicator_group <- group_by(filter(toronto, OCC_YEAR >= 2014, OCC_YEAR != "NA"), MCI_CATEGORY, OCC_YEAR)
crime_by_indicator <- summarise(indicator_group, n=n())
crime_by_indicator <- crime_by_indicator[order(crime_by_indicator$n, decreasing = TRUE),]

ggplot(aes(x = reorder(MCI_CATEGORY, n), y = n), data = crime_by_indicator) +
  geom_bar(stat = 'identity', width = 0.5) +
  geom_text(aes(label = n), stat = 'identity', data = crime_by_indicator, hjust = -0.1, size = 3.5) +
  coord_flip() +
  xlab('Major Crimes') +
  ylab('Number of Occurrences') +
  ggtitle('Major Crimes from 2014 to 2022') +
  theme_bw() +
  theme(plot.title = element_text(size = 16),
        axis.title = element_text(size = 12, face = "bold")) +
  facet_wrap(vars(OCC_YEAR))

What are the different types of Assault and which one is the worst.

# Getting the crimes from Assault Category for 2014
assault <- filter(toronto[toronto$MCI_CATEGORY == 'Assault', ], OCC_YEAR >= 2014, OCC_YEAR != "NA")
assault_group <- group_by(assault, OFFENCE, OCC_YEAR)
assault_by_offence <- summarise(assault_group, n=n())
assault_by_offence <- assault_by_offence[order(assault_by_offence$n, decreasing = TRUE), ]
ggplot(aes(x = reorder(OFFENCE, n), y = n), data = assault_by_offence) +
  geom_bar(stat = 'identity', width = 0.6) +
  geom_text(aes(label = n), stat = 'identity', data = assault_by_offence, hjust = -0.1, size = 3) +
  coord_flip() +
  xlab('Types of Assault') +
  ylab('Number of Occurrences') +
  ggtitle('Assault Crimes in 2014') +
  theme_bw() +
  theme(plot.title = element_text(size = 16),
        axis.title = element_text(size = 12, face = "bold")) +
  facet_wrap(vars(OCC_YEAR))

Let’s look at the offences instead.

# Getting the Offences from 2014 to 2022
offence_group <- group_by(filter(toronto, OCC_YEAR >= 2014, OCC_YEAR != "NA"), OFFENCE, OCC_YEAR)
crime_by_offence <- summarise(offence_group, n=n())
crime_by_offence <- crime_by_offence[order(crime_by_offence$n, decreasing = TRUE), ]
ggplot(aes(x = reorder(OFFENCE, n), y = n), data = crime_by_offence) +
  geom_bar(stat = 'identity', width = 0.7) +
  geom_text(aes(label = n), stat = 'identity', data = crime_by_offence, hjust = -0.1, size = 2) +
  coord_flip() +
  xlab('Types of Offence') +
  ylab('Number of Occurrences') +
  ggtitle('Offence Types Toronto from 2014 to 2022') +
  theme_bw() +
  theme(plot.title = element_text(size = 16),
        axis.title = element_text(size = 12, face = "bold")) +
  facet_wrap(vars(OCC_YEAR))

Checking for crimes wrt time of the day.

# Crimes WRT Time of Day from 2014 to 2022
hour_group <- group_by(filter(toronto, OCC_YEAR >= 2014, OCC_YEAR != "NA"), OCC_HOUR, OCC_YEAR)
crime_hour <- summarise(hour_group, n=n())
ggplot(aes(x=OCC_HOUR, y=n), data = crime_hour) + geom_line(size = 2.5, alpha = 0.7, color = "mediumseagreen", group=1) + 
  geom_point(size = 0.5) + 
  ggtitle('Total Crimes by Hour of Day in Toronto from 2014 to 2022') +
  ylab('Number of Occurrences') +
  xlab('Hour(24-hour clock)') +
  theme_bw() +
  theme(plot.title = element_text(size = 16),
        axis.title = element_text(size = 12, face = "bold")) +
  facet_wrap(vars(OCC_YEAR))

What types of Crimes are most frequent wrt the time of the day?

# Crime Types WRT Hour of the day from 2014 to 2022
hour_crime_group <- group_by(filter(toronto, OCC_YEAR >= 2014), OCC_HOUR, OCC_YEAR, MCI_CATEGORY)
hour_crime <- summarise(hour_crime_group, n=n())
ggplot(aes(x=OCC_HOUR, y=n, color=MCI_CATEGORY), data = hour_crime) + 
  geom_line(size=1.5) + 
  ggtitle('Crime Types by Hour of Day in Toronto from 2014 to 2022') +
  ylab('Number of Occurrences') +
  xlab('Hour(24-hour clock)') +
  theme_bw() +
  theme(plot.title = element_text(size = 16),
        axis.title = element_text(size = 12, face = "bold")) +
  facet_wrap(vars(OCC_YEAR))

Where in Toronto were those crimes most likely to occur

# Crimes WRT location from 2014 to 2022
location_group <- group_by(filter(toronto, OCC_YEAR >= 2014), NEIGHBOURHOOD_158)
crime_by_location <- summarise(location_group, n=n())
crime_by_location <- crime_by_location[order(crime_by_location$n, decreasing = TRUE), ]
crime_by_location_top20 <- head(crime_by_location, 20)
ggplot(aes(x = reorder(NEIGHBOURHOOD_158, n), y = n), data = crime_by_location_top20) +
  geom_bar(stat = 'identity', width = 0.6) +
  geom_text(aes(label = n), stat = 'identity', data = crime_by_location_top20, hjust = -0.1, size = 3) +
  coord_flip() +
  xlab('Neighbourhoods') +
  ylab('Number of Occurrences') +
  ggtitle('Neighbourhoods with Most Crimes - Top 20 from 2014 to 2022') +
  theme_bw() +
  theme(plot.title = element_text(size = 16),
        axis.title = element_text(size = 12, face = "bold"))

Which are the Safest Neighborhoods to Live in 2022

location_group <- group_by(filter(toronto, OCC_YEAR == 2022), NEIGHBOURHOOD_158)
crime_by_location <- summarise(location_group, n=n())
crime_by_location <- crime_by_location[order(crime_by_location$n, decreasing = TRUE), ]
tail(crime_by_location, 5)
## # A tibble: 5 × 2
##   NEIGHBOURHOOD_158            n
##   <chr>                    <int>
## 1 Bendale South               71
## 2 Humber Heights-Westmount    71
## 3 Woodbine-Lumsden            66
## 4 Bayview Woods-Steeles       61
## 5 Guildwood                   50

Comparing Neighborhoods with top offence types

# Top Offences in Each Neighborhood from in 2022
offence_location_group <- group_by(filter(toronto, OCC_YEAR == 2022), NEIGHBOURHOOD_158, OFFENCE, OCC_YEAR)
offence_type_by_location <- summarise(offence_location_group, n=n())
offence_type_by_location <- offence_type_by_location[order(offence_type_by_location$n, decreasing = TRUE), ]
offence_type_by_location_top20 <- head(offence_type_by_location, 50)
ggplot(aes(x = NEIGHBOURHOOD_158, y=n, fill = OFFENCE), data=offence_type_by_location_top20) +
  geom_bar(stat = 'identity', position = position_dodge(), width = 0.8) +
  xlab('Neighbourhood') +
  ylab('Number of Occurrence') +
  ggtitle('Offence Type vs. Neighbourhood Toronto in 2022') + 
  theme_bw() +
  theme(plot.title = element_text(size = 16),
        axis.title = element_text(size = 12, face = "bold"),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = .4)) +
  facet_wrap(vars(OCC_YEAR))

Month Wise Crime Rates

# Major Crimes for every month from 2014 to 2022
crime_count <- filter(toronto, OCC_YEAR >= 2014) %>% 
  group_by(OCC_MONTH, MCI_CATEGORY, OCC_YEAR) %>% 
  summarise(Total = n())
crime_count$OCC_MONTH <- ordered(crime_count$OCC_MONTH, levels = c('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'))
ggplot(crime_count, aes(OCC_MONTH, MCI_CATEGORY, fill = Total)) +
  geom_tile(size = 1, color = "white") +
  scale_fill_viridis()  +
  geom_text(aes(label=Total), color='white') +
  ggtitle("Major Crime Indicators by Month from 2014 to 2022") +
  xlab('Month') +
  theme(plot.title = element_text(size = 16), 
        axis.title = element_text(size = 12, face = "bold")) +
  facet_wrap(vars(OCC_YEAR))

Day of the Week Wise Crime Rates

# Major Crimes for every DOW from 2014 to 2022
day_count <- filter(toronto, OCC_YEAR >= 2014) %>% 
  group_by(OCC_DOW, MCI_CATEGORY, OCC_YEAR) %>% 
  summarise(Total = n())
ggplot(day_count, aes(OCC_DOW, MCI_CATEGORY, fill = Total)) +
  geom_tile(size = 1, color = "white") +
  scale_fill_viridis()  +
  geom_text(aes(label=Total), color='white') +
  ggtitle("Major Crime Indicators by Day of Week from 2014 to 2022") +
  xlab('Day of Week') +
  theme(plot.title = element_text(size = 16), 
        axis.title = element_text(size = 12, face = "bold")) +
  facet_wrap(vars(OCC_YEAR))

Homicide Rates

homicide <- read.csv('data/Homicides.csv', stringsAsFactors = F)
homicide <- filter(homicide, OCC_DATE >= 2014)
homicide$OCC_DATE <- as.Date(homicide$OCC_DATE)
year_group <- group_by(homicide, OCC_DATE, HOMICIDE_TYPE)
homicide_by_year <- summarise(year_group, n=n())
ggplot(aes(x = OCC_DATE, y=n, fill = HOMICIDE_TYPE), data = homicide_by_year) +
  geom_bar(stat = 'identity', position = position_dodge(), width = 1) +
  xlab('Year') +
  ylab('Number of Homicides') +
  ggtitle('Homicide 2014-2022') + 
  ylim(0, 5) +
  theme_bw() +
  theme(plot.title = element_text(size = 16),
        axis.title = element_text(size = 12, face = "bold"))

homicide$month <- format(as.Date(homicide$OCC_DATE) , "%B")
homicide_count <- homicide %>% group_by(OCC_YEAR, month) %>% summarise(Total = n())
homicide_count$month <- ordered(homicide_count$month, levels = c('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'))
ggplot(homicide_count, aes(OCC_YEAR, month, fill = Total)) +
  geom_tile(size = 1, color = "white") +
  scale_fill_viridis()  +
  geom_text(aes(label=Total), color='white') +
  ggtitle("Homicides in Toronto (2014-2022)") +
  xlab('Year') +
  theme(plot.title = element_text(size = 16), 
        axis.title = element_text(size = 12, face = "bold"))

by_groups <- group_by(toronto, MCI_CATEGORY, NEIGHBOURHOOD_158)
groups <- summarise(by_groups, n=n())
groups <- groups[c("NEIGHBOURHOOD_158", "MCI_CATEGORY", "n")]
groups_wide <- spread(groups, key = MCI_CATEGORY, value = n)
groups_wide
## # A tibble: 159 × 6
##    NEIGHBOURHOOD_158 Assault `Auto Theft` `Break and Enter` Robbery `Theft Over`
##    <chr>               <int>        <int>             <int>   <int>        <int>
##  1 Agincourt North       568          314               374     182           40
##  2 Agincourt South-…     947          351               648     173          103
##  3 Alderwood             277          193               241      49           57
##  4 Annex                1993          246              1363     288          271
##  5 Avondale              355          119                98      44           39
##  6 Banbury-Don Mills     656          236               543      80           88
##  7 Bathurst Manor        445          311               254      80           34
##  8 Bay-Cloverhill        975           81               488     156          100
##  9 Bayview Village       609          217               329      53           75
## 10 Bayview Woods-St…     289          133               223      26           21
## # ℹ 149 more rows
z <- groups_wide[, -c(1,1)]
z <- z[complete.cases(z), ]
m <- apply(z, 2, mean)
s <- apply(z, 2, sd)
z <- scale(z, m, s)
wss <- (nrow(z)-1) * sum(apply(z, 2, var))
for (i in 2:20) wss[i] <- sum(kmeans(z, centers=i)$withiness)
plot(1:20, wss, type='b', xlab='Number of Clusters', ylab='Within groups sum of squares')

kc <- kmeans(z, 2)
kc
## K-means clustering with 2 clusters of sizes 17, 142
## 
## Cluster means:
##      Assault Auto Theft Break and Enter    Robbery Theft Over
## 1  2.1583428  1.0726446       2.1757297  2.0033547  2.2788979
## 2 -0.2583932 -0.1284152      -0.2604747 -0.2398382 -0.2728258
## 
## Clustering vector:
##   [1] 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 1
##  [38] 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [75] 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 1 2
## [149] 2 2 2 2 2 1 2 2 2 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 229.2655 184.7964
##  (between_SS / total_SS =  47.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
z1 <- data.frame(z, kc$cluster)
clusplot(z1, kc$cluster, color=TRUE, shade=F, labels=0, lines=0, main='k-Means Cluster Analysis')